Chap 5 - EDA

Imports

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy import stats
import warnings
import ydata_profiling as pp 

pd.set_option('display.max_columns', 50)
warnings.filterwarnings("ignore")

Load the Dataset

file_url = "data/vehicles.csv.zip"
fueleco = pd.read_csv(file_url)
fueleco
barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF cylinders displ drive ... UCity UCityA UHighway UHighwayA VClass year youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2 rangeA evMotor mfrCode c240Dscr charge240b c240bDscr createdOn modifiedOn startStop phevCity phevHwy phevComb
0 15.695714 0.0 0.0 0.0 19 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 423.190476 21 0.0 0 0.0 0.0 0.0 0.0 4.0 2.0 Rear-Wheel Drive ... 23.3333 0.0 35.0000 0.0 Two Seaters 1985 -1750 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
1 29.964545 0.0 0.0 0.0 9 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 807.909091 11 0.0 0 0.0 0.0 0.0 0.0 12.0 4.9 Rear-Wheel Drive ... 11.0000 0.0 19.0000 0.0 Two Seaters 1985 -10500 T NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
2 12.207778 0.0 0.0 0.0 23 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 329.148148 27 0.0 0 0.0 0.0 0.0 0.0 4.0 2.2 Front-Wheel Drive ... 29.0000 0.0 47.0000 0.0 Subcompact Cars 1985 250 NaN SIL NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
3 29.964545 0.0 0.0 0.0 10 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 807.909091 11 0.0 0 0.0 0.0 0.0 0.0 8.0 5.2 Rear-Wheel Drive ... 12.2222 0.0 16.6667 0.0 Vans 1985 -10500 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
4 17.347895 0.0 0.0 0.0 17 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 467.736842 19 0.0 0 0.0 0.0 0.0 0.0 4.0 2.2 4-Wheel or All-Wheel Drive ... 21.0000 0.0 32.0000 0.0 Compact Cars 1993 -4750 NaN NaN T NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39096 14.982273 0.0 0.0 0.0 19 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 403.954545 22 0.0 0 0.0 0.0 0.0 0.0 4.0 2.2 Front-Wheel Drive ... 24.0000 0.0 37.0000 0.0 Compact Cars 1993 -1500 NaN CLKUP NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
39097 14.330870 0.0 0.0 0.0 20 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 386.391304 23 0.0 0 0.0 0.0 0.0 0.0 4.0 2.2 Front-Wheel Drive ... 25.0000 0.0 39.0000 0.0 Compact Cars 1993 -1000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
39098 15.695714 0.0 0.0 0.0 18 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 423.190476 21 0.0 0 0.0 0.0 0.0 0.0 4.0 2.2 4-Wheel or All-Wheel Drive ... 23.0000 0.0 34.0000 0.0 Compact Cars 1993 -1750 NaN CLKUP NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
39099 15.695714 0.0 0.0 0.0 18 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 423.190476 21 0.0 0 0.0 0.0 0.0 0.0 4.0 2.2 4-Wheel or All-Wheel Drive ... 23.0000 0.0 34.0000 0.0 Compact Cars 1993 -1750 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0
39100 18.311667 0.0 0.0 0.0 16 0.0 0 0.0 0.0 0.0 0.0 -1 -1 0.0 493.722222 18 0.0 0 0.0 0.0 0.0 0.0 4.0 2.2 4-Wheel or All-Wheel Drive ... 20.0000 0.0 29.0000 0.0 Compact Cars 1993 -5500 NaN CLKUP T NaN NaN NaN NaN NaN NaN NaN 0.0 NaN Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN 0 0 0

39101 rows × 83 columns

fueleco.select_dtypes(exclude="object").mean()
fueleco.select_dtypes(exclude="object").std()
fueleco.select_dtypes(exclude="object").info()
fueleco.select_dtypes(exclude=["object", "bool"]).quantile(
    [0, 0.25, 0.5, 0.75, 1]
)
fueleco.select_dtypes(exclude=["object", "bool"]).describe().T
fueleco.describe(include=object)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 60 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barrels08        39101 non-null  float64
 1   barrelsA08       39101 non-null  float64
 2   charge120        39101 non-null  float64
 3   charge240        39101 non-null  float64
 4   city08           39101 non-null  int64  
 5   city08U          39101 non-null  float64
 6   cityA08          39101 non-null  int64  
 7   cityA08U         39101 non-null  float64
 8   cityCD           39101 non-null  float64
 9   cityE            39101 non-null  float64
 10  cityUF           39101 non-null  float64
 11  co2              39101 non-null  int64  
 12  co2A             39101 non-null  int64  
 13  co2TailpipeAGpm  39101 non-null  float64
 14  co2TailpipeGpm   39101 non-null  float64
 15  comb08           39101 non-null  int64  
 16  comb08U          39101 non-null  float64
 17  combA08          39101 non-null  int64  
 18  combA08U         39101 non-null  float64
 19  combE            39101 non-null  float64
 20  combinedCD       39101 non-null  float64
 21  combinedUF       39101 non-null  float64
 22  cylinders        38956 non-null  float64
 23  displ            38958 non-null  float64
 24  engId            39101 non-null  int64  
 25  feScore          39101 non-null  int64  
 26  fuelCost08       39101 non-null  int64  
 27  fuelCostA08      39101 non-null  int64  
 28  ghgScore         39101 non-null  int64  
 29  ghgScoreA        39101 non-null  int64  
 30  highway08        39101 non-null  int64  
 31  highway08U       39101 non-null  float64
 32  highwayA08       39101 non-null  int64  
 33  highwayA08U      39101 non-null  float64
 34  highwayCD        39101 non-null  float64
 35  highwayE         39101 non-null  float64
 36  highwayUF        39101 non-null  float64
 37  hlv              39101 non-null  int64  
 38  hpv              39101 non-null  int64  
 39  id               39101 non-null  int64  
 40  lv2              39101 non-null  int64  
 41  lv4              39101 non-null  int64  
 42  phevBlended      39101 non-null  bool   
 43  pv2              39101 non-null  int64  
 44  pv4              39101 non-null  int64  
 45  range            39101 non-null  int64  
 46  rangeCity        39101 non-null  float64
 47  rangeCityA       39101 non-null  float64
 48  rangeHwy         39101 non-null  float64
 49  rangeHwyA        39101 non-null  float64
 50  UCity            39101 non-null  float64
 51  UCityA           39101 non-null  float64
 52  UHighway         39101 non-null  float64
 53  UHighwayA        39101 non-null  float64
 54  year             39101 non-null  int64  
 55  youSaveSpend     39101 non-null  int64  
 56  charge240b       39101 non-null  float64
 57  phevCity         39101 non-null  int64  
 58  phevHwy          39101 non-null  int64  
 59  phevComb         39101 non-null  int64  
dtypes: bool(1), float64(32), int64(27)
memory usage: 17.6 MB
drive eng_dscr fuelType fuelType1 make model mpgData trany VClass guzzler trans_dscr tCharger sCharger atvType fuelType2 rangeA evMotor mfrCode c240Dscr c240bDscr createdOn modifiedOn startStop
count 37912 23431 39101 39101 39101 39101 39101 39090 39101 2335 15047 5816 738 3204 1490 1485 644 8289 53 51 39101 39101 7405
unique 7 545 14 6 134 3816 2 37 34 3 52 1 1 8 4 216 125 46 5 4 195 68 2
top Front-Wheel Drive (FFS) Regular Regular Gasoline Chevrolet F150 Pickup 2WD N Automatic 4-spd Compact Cars G CLKUP T S FFV E85 290 288V Ni-MH GMX standard charger 80 amp dual charger Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 N
freq 13653 8827 25620 26931 3900 214 26400 11045 5661 1356 7809 5816 738 1383 1383 74 117 1255 42 42 34199 29438 5176

Column Types

fueleco.dtypes.value_counts()
fueleco.select_dtypes(include="bool")
phevBlended
0 False
1 False
2 False
3 False
4 False
... ...
39096 False
39097 False
39098 False
39099 False
39100 False

39101 rows × 1 columns

fueleco.select_dtypes("int64").describe().T
count mean std min 25% 50% 75% max
city08 39101.0 18.077799 6.970672 6.0 15.0 17.0 20.0 150.0
cityA08 39101.0 0.569883 4.297124 0.0 0.0 0.0 0.0 145.0
co2 39101.0 72.538989 163.252019 -1.0 -1.0 -1.0 -1.0 847.0
co2A 39101.0 5.543950 55.956932 -1.0 -1.0 -1.0 -1.0 713.0
comb08 39101.0 20.323828 6.882807 7.0 17.0 20.0 23.0 136.0
combA08 39101.0 0.631160 4.395797 0.0 0.0 0.0 0.0 133.0
engId 39101.0 8582.377382 17606.675590 0.0 0.0 202.0 4401.0 69102.0
feScore 39101.0 0.122580 2.516348 -1.0 -1.0 -1.0 -1.0 10.0
fuelCost08 39101.0 2242.470781 601.273869 500.0 1850.0 2250.0 2500.0 6850.0
fuelCostA08 39101.0 91.335260 479.485802 0.0 0.0 0.0 0.0 3850.0
ghgScore 39101.0 0.120866 2.512612 -1.0 -1.0 -1.0 -1.0 10.0
ghgScoreA 39101.0 -0.923889 0.651017 -1.0 -1.0 -1.0 -1.0 8.0
highway08 39101.0 24.208588 7.128070 9.0 20.0 24.0 27.0 122.0
highwayA08 39101.0 0.736452 4.694207 0.0 0.0 0.0 0.0 121.0
hlv 39101.0 2.029539 5.959735 0.0 0.0 0.0 0.0 49.0
hpv 39101.0 10.411243 28.167271 0.0 0.0 0.0 0.0 195.0
id 39101.0 19662.541188 11413.329199 1.0 9776.0 19552.0 29555.0 39483.0
lv2 39101.0 1.834812 4.407887 0.0 0.0 0.0 0.0 41.0
lv4 39101.0 6.155930 9.698101 0.0 0.0 0.0 13.0 55.0
pv2 39101.0 13.649574 31.214466 0.0 0.0 0.0 0.0 194.0
pv4 39101.0 33.883711 45.991687 0.0 0.0 0.0 91.0 192.0
range 39101.0 0.500243 9.742080 0.0 0.0 0.0 0.0 335.0
year 39101.0 2000.635406 10.690422 1984.0 1991.0 2001.0 2010.0 2018.0
youSaveSpend 39101.0 -3459.572645 3010.284617 -26500.0 -4750.0 -3500.0 -1500.0 5250.0
phevCity 39101.0 0.094703 2.279478 0.0 0.0 0.0 0.0 97.0
phevHwy 39101.0 0.094269 2.191115 0.0 0.0 0.0 0.0 81.0
phevComb 39101.0 0.094141 2.226500 0.0 0.0 0.0 0.0 88.0
np.iinfo(np.int16)
fueleco[["city08", "comb08"]].info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   city08  39101 non-null  int64
 1   comb08  39101 non-null  int64
dtypes: int64(2)
memory usage: 611.1 KB
(
    fueleco[["city08", "comb08"]]
    .assign(
        city08=fueleco.city08.astype(np.int16),
        comb08=fueleco.comb08.astype(np.int16),
    )
    .info(memory_usage="deep")
)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   city08  39101 non-null  int16
 1   comb08  39101 non-null  int16
dtypes: int16(2)
memory usage: 152.9 KB
fueleco[["make"]].info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   make    39101 non-null  object
dtypes: object(1)
memory usage: 2.4 MB
(
    fueleco[["make"]]
    .assign(make=fueleco.make.astype("category"))
    .info(memory_usage="deep")
)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   make    39101 non-null  category
dtypes: category(1)
memory usage: 89.5 KB

Categorical data

fueleco.select_dtypes(object).columns
fueleco.drive.nunique()
fueleco.drive.sample(5, random_state=42)
fueleco.drive.isna().mean() * 100
fueleco.drive.value_counts(dropna=False)
drive
Front-Wheel Drive             13653
Rear-Wheel Drive              13284
4-Wheel or All-Wheel Drive     6648
All-Wheel Drive                2401
4-Wheel Drive                  1221
NaN                            1189
2-Wheel Drive                   507
Part-time 4-Wheel Drive         198
Name: count, dtype: int64
top_n = fueleco.make.value_counts().index[:6]
(
    fueleco
    .assign(
        make=fueleco.make.where(
            fueleco.make.isin(top_n), "Other"
        )
    )
).make.value_counts(dropna=False)
make
Other        23211
Chevrolet     3900
Ford          3208
Dodge         2557
GMC           2442
Toyota        1976
BMW           1807
Name: count, dtype: int64
fig, ax = plt.subplots(figsize=(10, 8))
top_n = fueleco.make.value_counts().index[:6]
(
    fueleco
    .assign(
        make=fueleco.make.where(
            fueleco.make.isin(top_n), "Other"
        )
    )
    .make.value_counts(dropna=False)
    .plot.bar(ax=ax)
)
fig.savefig("c5-catpan.png", dpi=300)

fig, ax = plt.subplots(figsize=(10,8))
top_n = fueleco.make.value_counts().index[:6]
sns.countplot(
    y="make",
    data=(
        fueleco
        .assign(make=fueleco.make.where(fueleco.make.isin(top_n), "Other"))
    )
)
fig.savefig("c5-catsns.png", dpi=300)

fueleco.rangeA.value_counts()
rangeA
290            74
270            56
280            53
310            41
277            38
               ..
351             1
240/290/290     1
303             1
340-350         1
341             1
Name: count, Length: 216, dtype: int64
fueleco.rangeA
(
    fueleco.rangeA.str.extract(r"([^0-9.])")
    .dropna()
    .apply(lambda row: "".join(row), axis=1)
    .value_counts()
)
/    280
-     71
Name: count, dtype: int64
set(fueleco.rangeA.apply(type))
fueleco.rangeA.isna().sum()
37616
(
    fueleco.rangeA
    .fillna("0")
    .str.replace("-", "/")
    .str.split("/", expand=True)
    .astype(float)
    .mean(axis=1)
    .pipe(lambda ser_: pd.cut(ser_, 10))
    .value_counts()
    # .pipe(lambda s_: s_.where(s_< 100))
    # .dropna()
    # .sort_values()
)
(-0.45, 44.95]     37688
(269.7, 314.65]      559
(314.65, 359.6]      352
(359.6, 404.55]      205
(224.75, 269.7]      181
(404.55, 449.5]       82
(89.9, 134.85]        12
(179.8, 224.75]        9
(44.95, 89.9]          8
(134.85, 179.8]        5
Name: count, dtype: int64

Continuous Data

fueleco.select_dtypes("number")
fueleco.city08.sample(5, random_state=42)
fueleco.city08.isna().sum()
fueleco.city08.describe()
count    39101.000000
mean        18.077799
std          6.970672
min          6.000000
25%         15.000000
50%         17.000000
75%         20.000000
max        150.000000
Name: city08, dtype: float64
fig, ax = plt.subplots(figsize=(10,8))
fueleco.city08.hist(ax=ax, bins=30)
fig.savefig(
    "c5-conthistpan.png",
    dpi=300
)

fig, ax = plt.subplots(figsize=(10,8))
sns.distplot(fueleco.city08, rug=True, ax=ax)
fig.savefig(
    "c5-conthistsns.png",
    dpi=300
)

fig, axs = plt.subplots(nrows=3, figsize=(10,8))
sns.boxplot(x=fueleco.city08, ax=axs[0])
sns.violinplot(x=fueleco.city08, ax=axs[1])
sns.boxenplot(x=fueleco.city08, ax=axs[2])
fig.savefig(
    "c5-contothersns.png",
    dpi=300
)

stats.kstest(fueleco.city08, cdf="norm")
KstestResult(statistic=0.9999999990134123, pvalue=0.0)
fig, ax = plt.subplots(figsize=(10, 8))
stats.probplot(fueleco.city08, plot=ax)
fig.savefig(
    "c5-conprob.png",
    dpi=300
)

Comparing continuous values across categories

mask = fueleco.make.isin(
    ["Ford", "Honda", "Tesla", "BMW"]
)
fueleco[mask].groupby("make").city08.agg(
    ["mean", "std"]
)
mean std
make
BMW 17.817377 7.372907
Ford 16.853803 6.701029
Honda 24.372973 9.154064
Tesla 92.826087 5.538970
g = sns.catplot(
    x="make", y="city08", data=fueleco[mask], kind="box"
)

(fueleco[mask]
.groupby("make")
.city08
.count()
)
make
BMW      1807
Ford     3208
Honda     925
Tesla      46
Name: city08, dtype: int64
g = sns.catplot(
    x="make", y="city08", data=fueleco[mask], kind="box"
)
sns.swarmplot(
    x="make",
    y="city08",
    data=fueleco[mask],
    color="k",
    size=1,
    ax=g.ax,
)
g.ax.figure

g = sns.catplot(
    x="make",
    y="city08",
    data=fueleco[mask],
    kind="box",
    col="year",
    col_order=[2012, 2014, 2016, 2018],
    col_wrap=2,
)

g = sns.catplot(
    x="make",
    y="city08",
    data=fueleco[mask],
    kind="box",
    hue="year",
    hue_order=[2012, 2014, 2016, 2018],
)

(
    fueleco[mask]
    .groupby("make")
    .city08.agg(["mean", "std"])
    .style.background_gradient(cmap="RdBu", axis=0)
)
  mean std
make    
BMW 17.817377 7.372907
Ford 16.853803 6.701029
Honda 24.372973 9.154064
Tesla 92.826087 5.538970

Comparing two continuous columns

fueleco.city08.cov(fueleco.highway08)
fueleco.city08.cov(fueleco.comb08)
fueleco.city08.cov(fueleco.cylinders)
-5.931560263764756
fueleco.city08.corr(fueleco.highway08)
fueleco.city08.corr(fueleco.cylinders)
-0.7016548423827876
fig, ax = plt.subplots(figsize=(8,8))
corr = fueleco[
    ["city08", "highway08", "cylinders"]
].corr()
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(
    corr,
    mask=mask,
    fmt=".2f",
    annot=True,
    ax=ax,
    cmap="RdBu",
    vmin=-1,
    vmax=1,
    square=True

)

fig, ax = plt.subplots(figsize=(8,8))
fueleco.plot.scatter(
    x="city08", y="highway08", alpha=0.1, ax=ax
)
fig.savefig(
    "c5-scatpan.png",
    dpi=300,
    bbox_inches="tight",
)

fig, ax = plt.subplots(figsize=(8,8))
fueleco.plot.scatter(
    x="city08", y="cylinders", alpha=0.1, ax=ax
)

fig, ax = plt.subplots(figsize=(8,8))
(
    fueleco.assign(
        cylinders=fueleco.cylinders.fillna(0)
    ).plot.scatter(
        x="city08", y="cylinders", alpha=0.1, ax=ax
    )
)

res = sns.lmplot(
    x="city08", y="highway08", data=fueleco
)

res = sns.relplot(
    x="city08", 
    y="highway08", 
    data=fueleco.assign(
        cylinders=fueleco.cylinders.fillna(0)
    ),
    hue="year",
    size="barrels08",
    alpha=0.5,
    height=8,
)

res = sns.relplot(
    x="city08", 
    y="highway08", 
    data=fueleco.assign(
        cylinders=fueleco.cylinders.fillna(0)
    ),
    hue="year",
    size="barrels08",
    alpha=0.5,
    height=8,
    col="make",
    col_order=["Ford", "Tesla"]
)

fueleco.city08.corr(
    fueleco.barrels08, method="spearman"
)
-0.9743658763718624

Comparing categorical values with categorical value_counts

def generalize(ser, match_name, default):
    seen = None
    for match, name in match_name:
        mask = ser.str.contains(match)
        if seen is None:
            seen = mask
        else:
            seen |= mask
        ser = ser.where(~mask, name)
    ser = ser.where(seen, default)
    return ser
makes = ["Ford", "Tesla", "BMW", "Toyota"]
data = fueleco[fueleco.make.isin(makes)].assign(
    SClass=lambda df_: generalize(
        df_.VClass, [
            ("Seaters", "Car"),
            ("Car", "Car"),
            ("Utility", "SUV"),
            ("Truck", "Truck"),
            ("Van", "Van"),
            ("van", "Van"),
            ("Wagon", "Wagon"),
        ],
        "other",
    )
)
data.groupby(["make", "SClass"]).size().unstack()
pd.crosstab(data.make, data.SClass)
SClass Car SUV Truck Van Wagon other
make
BMW 1557 158 0 0 92 0
Ford 1075 372 1040 332 155 234
Tesla 36 10 0 0 0 0
Toyota 773 376 478 94 132 123
pd.crosstab(
    [data.year, data.make], [data.SClass, data.VClass]
)
SClass Car SUV Truck Van Wagon other
VClass Compact Cars Large Cars Midsize Cars Minicompact Cars Subcompact Cars Two Seaters Vans, Cargo Type Small Sport Utility Vehicle 2WD Small Sport Utility Vehicle 4WD Sport Utility Vehicle - 2WD Sport Utility Vehicle - 4WD Standard Sport Utility Vehicle 2WD Standard Sport Utility Vehicle 4WD Small Pickup Trucks Small Pickup Trucks 2WD Small Pickup Trucks 4WD Standard Pickup Trucks Standard Pickup Trucks 2WD Standard Pickup Trucks 4WD Minivan - 2WD Minivan - 4WD Vans Vans, Passenger Type Midsize Station Wagons Midsize-Large Station Wagons Small Station Wagons Special Purpose Vehicle 2WD Special Purpose Vehicle 4WD Special Purpose Vehicles
year make
1984 BMW 6 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ford 33 3 9 0 14 6 26 0 0 0 0 0 0 0 18 11 0 32 26 0 0 0 12 5 2 12 0 21 6
Toyota 13 0 0 3 22 0 4 0 0 0 0 0 0 0 0 0 0 13 3 0 0 0 4 0 0 10 0 3 2
1985 BMW 7 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ford 31 2 5 0 13 3 0 0 0 0 0 0 0 16 0 0 54 0 0 0 0 26 0 0 3 13 5 9 6
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2017 Tesla 0 8 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Toyota 3 0 16 2 2 0 0 5 3 0 0 2 10 0 2 5 0 3 3 1 1 0 0 1 0 0 0 0 0
2018 BMW 37 12 5 0 25 1 0 0 2 0 0 2 5 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0
Ford 0 0 8 0 0 0 0 4 2 0 0 1 2 0 1 0 0 12 14 0 0 0 3 0 0 0 6 0 0
Toyota 4 0 16 0 2 0 0 2 3 0 0 2 5 0 0 0 0 2 3 0 0 0 0 0 0 0 0 0 0

111 rows × 29 columns

fig, ax = plt.subplots(figsize=(10,8))
(
    data.pipe(
        lambda df_: pd.crosstab(df_.make, df_.SClass)
    ).plot.bar(ax=ax)
)

res = sns.catplot(
    kind="count", x="make", hue="SClass", data=data
)

fig, ax = plt.subplots(figsize=(10,8))
(
    data.pipe(
        lambda df_: pd.crosstab(df_.make, df_.SClass)
    )
    .pipe(lambda df_: df_.div(df_.sum(axis=1), axis=0))
    .plot.bar(stacked=True, ax=ax)
)

Using the pandas profiling library

For compatibility with pydantic v2 need to pip install ydata-profiling and then import ydata-profiling as pp

report = pp.ProfileReport(fueleco)
report.to_file("fueleco.html")